1. How do trail visitor counts correlate to an increase in snowfall during the previous day(s)? (what patterns of snowfall yield the most visitors/changes in visitation)

ggplot(winter_travel, aes(change_depth, user.count, color = Trailhead))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.5)+
  facet_grid(modality~year)+
  scale_y_log10()

ggplot(winter_travel, aes(change_depth, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()+
  facet_grid(modality~Trailhead)

ggplot(winter_travel, aes(past3snow, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()+
  facet_grid(modality~Trailhead)

ggplot(winter_travel, aes(past5snow, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()+
  facet_grid(modality~Trailhead)

ggplot(all_users, aes(change_depth, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()

ggplot(all_users, aes(air_temp, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()

ggplot(all_users, aes(lag2snow, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()

ggplot(all_users, aes(lag3snow, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()

ggplot(all_users, aes(lag4snow, user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  scale_y_log10()

pairs(winter_travel[,c("change_depth","past2snow","past3snow","past4snow","past5snow")])

pairs(winter_travel[,c("change_depth","lag2snow","lag3snow","lag4snow")])

Poisson models

glm0 <- glm(user.count ~ 1, data=all_users)
summary(glm0)
## 
## Call:
## glm(formula = user.count ~ 1, data = all_users)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -95.28  -54.28  -19.28   32.72  509.72  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   95.280      2.807   33.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 6129.417)
## 
##     Null deviance: 4762557  on 777  degrees of freedom
## Residual deviance: 4762557  on 777  degrees of freedom
## AIC: 8995.7
## 
## Number of Fisher Scoring iterations: 2
glm1 <- glm(user.count ~ change_depth+lag2snow+lag3snow+lag4snow+air_temp, family = poisson, data=all_users)
summary(glm1)
## 
## Call:
## glm(formula = user.count ~ change_depth + lag2snow + lag3snow + 
##     lag4snow + air_temp, family = poisson, data = all_users)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -13.808   -6.263   -2.039    3.132   35.855  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.4291357  0.0129598 341.760   <2e-16 ***
## change_depth -0.0208990  0.0019166 -10.904   <2e-16 ***
## lag2snow     -0.0025174  0.0018126  -1.389   0.1649    
## lag3snow     -0.0031262  0.0017749  -1.761   0.0782 .  
## lag4snow     -0.0014054  0.0017575  -0.800   0.4239    
## air_temp      0.0052044  0.0004812  10.816   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 43596  on 777  degrees of freedom
## Residual deviance: 43281  on 772  degrees of freedom
## AIC: 47963
## 
## Number of Fisher Scoring iterations: 5
hist(resid(glm1))

confint(glm1)
## Waiting for profiling to be done...
##                     2.5 %        97.5 %
## (Intercept)   4.403701411  4.4545029514
## change_depth -0.024665112 -0.0171522714
## lag2snow     -0.006078060  0.0010273450
## lag3snow     -0.006612256  0.0003450671
## lag4snow     -0.004858437  0.0020309387
## air_temp      0.004261695  0.0061478751

Don’t like this model. AIC went way up adding predictors. Overdispersion is definitely present. Negative binomial is probably a better choice for the conditional distribution.

Negative binomial models

glm0nb <- glm.nb(user.count ~ weekend, data=all_users)
summary(glm0nb)
## 
## Call:
## glm.nb(formula = user.count ~ weekend, data = all_users, init.theta = 1.630430642, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7985  -0.8809  -0.1989   0.4486   2.7430  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     4.37478    0.03361 130.152   <2e-16 ***
## weekendweekend  0.52678    0.06246   8.434   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.6304) family taken to be 1)
## 
##     Null deviance: 944.56  on 777  degrees of freedom
## Residual deviance: 868.75  on 776  degrees of freedom
## AIC: 8523.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.6304 
##           Std. Err.:  0.0796 
## 
##  2 x log-likelihood:  -8517.7890
glm1nb <- glm.nb(user.count ~ change_depth+lag3snow+air_temp+weekend, data=all_users)
summary(glm1nb)
## 
## Call:
## glm.nb(formula = user.count ~ change_depth + lag3snow + air_temp + 
##     weekend, data = all_users, init.theta = 1.648230555, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7780  -0.8922  -0.2172   0.4375   2.8737  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     4.212314   0.097600  43.159   <2e-16 ***
## change_depth   -0.027548   0.014053  -1.960   0.0500 *  
## lag3snow        0.016459   0.013747   1.197   0.2312    
## air_temp        0.006140   0.003631   1.691   0.0909 .  
## weekendweekend  0.545700   0.062897   8.676   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.6482) family taken to be 1)
## 
##     Null deviance: 954.29  on 777  degrees of freedom
## Residual deviance: 868.34  on 773  degrees of freedom
## AIC: 8520.5
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.6482 
##           Std. Err.:  0.0806 
## 
##  2 x log-likelihood:  -8508.5040
hist(resid(glm1nb))

confint(glm1nb)
## Waiting for profiling to be done...
##                       2.5 %      97.5 %
## (Intercept)     4.017397191 4.411521222
## change_depth   -0.055587761 0.001578779
## lag3snow       -0.010845054 0.044667768
## air_temp       -0.001196478 0.013406582
## weekendweekend  0.423396724 0.669979550

Looking at AIC, adding predictors gives a slight improvement. Looks like about a 2.7% decrease in users per cm of new snow. Maybe about an 0.6% increase in users per degC of air temp. Recent snow doesn’t register a strong effect.

2. How does an increased avalanche risk impact the numbers of hybrid and motorized visitors to specific trailheads?

ggplot(na.omit(winter_travel), aes(rating_near, user.count, color = Trailhead, fill = Trailhead))+
  geom_point(position=position_jitterdodge(jitter.width = 0.2))+
  geom_boxplot(color="black", outlier.shape = NA)+
  scale_y_log10()+
  facet_wrap(~has_sled)

ggplot(na.omit(winter_travel), aes(rating_near, user.count))+
  geom_jitter()+
  geom_boxplot(outlier.shape = NA)+
  scale_y_log10()+
  facet_grid(has_sled~Trailhead)

We’ll model the trailheads independently since they are so different. Lots of things to say, but one note is that at Kebler, only high ratings lead to statistically significant drops in use, but at Slate, there’s a noticable drop at considerable and maybe even moderate.

glm2brushrd <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Brush Creek Rd"),])
summary(glm2brushrd)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Brush Creek Rd"), ], 
##     init.theta = 0.05012781, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4192  -0.4192  -0.3019  -0.3019   1.6795  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   -1.4307     0.5121  -2.794  0.00521 **
## rating_near2  -1.1686     0.6986  -1.673  0.09436 . 
## rating_near3  -0.8718     0.8307  -1.049  0.29396   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.0501) family taken to be 1)
## 
##     Null deviance: 57.565  on 309  degrees of freedom
## Residual deviance: 54.482  on 307  degrees of freedom
##   (78 observations deleted due to missingness)
## AIC: 197.92
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.0501 
##           Std. Err.:  0.0184 
## 
##  2 x log-likelihood:  -189.9240
glm2brushth <- glm.nb(user.count ~ rating_near, 
                   data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Brush Creek Trailhead"),])
summary(glm2brushth)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Brush Creek Trailhead"), 
##     ], init.theta = 0.0557544471, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4045  -0.2330  -0.2256  -0.2256   2.3487  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    -1.6818     0.7363  -2.284   0.0224 *
## rating_near2   -1.7522     0.9262  -1.892   0.0585 .
## rating_near3   -1.6716     0.9314  -1.795   0.0727 .
## rating_near4  -17.6208  2107.8559  -0.008   0.9933  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.0558) family taken to be 1)
## 
##     Null deviance: 47.436  on 360  degrees of freedom
## Residual deviance: 41.153  on 357  degrees of freedom
##   (103 observations deleted due to missingness)
## AIC: 128.39
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.0558 
##           Std. Err.:  0.0325 
## 
##  2 x log-likelihood:  -118.3850
glm2gothic <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Gothic Rd"),])
summary(glm2gothic)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Gothic Rd"), ], init.theta = 0.1723991875, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5975  -0.5904  -0.5659  -0.5659   3.0120  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.19942    0.24991  -4.799 1.59e-06 ***
## rating_near2   -0.13248    0.30370  -0.436    0.663    
## rating_near3    0.03846    0.33447   0.115    0.908    
## rating_near4  -18.10317 2107.85569  -0.009    0.993    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1724) family taken to be 1)
## 
##     Null deviance: 288.93  on 669  degrees of freedom
## Residual deviance: 281.83  on 666  degrees of freedom
##   (106 observations deleted due to missingness)
## AIC: 823.63
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.1724 
##           Std. Err.:  0.0304 
## 
##  2 x log-likelihood:  -813.6310
glm2cement <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Cement Creek"),])
summary(glm2cement)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Cement Creek"), ], init.theta = 0.5804415395, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4035  -1.2659  -0.5091   0.3274   2.0827  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.88446    0.16041   5.514 3.51e-08 ***
## rating_near2  0.06602    0.18682   0.353   0.7238    
## rating_near3 -0.33761    0.19837  -1.702   0.0888 .  
## rating_near4 -1.04698    0.41315  -2.534   0.0113 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.5804) family taken to be 1)
## 
##     Null deviance: 527.70  on 501  degrees of freedom
## Residual deviance: 514.57  on 498  degrees of freedom
##   (132 observations deleted due to missingness)
## AIC: 1958.9
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.5804 
##           Std. Err.:  0.0586 
## 
##  2 x log-likelihood:  -1948.8960
glm2kebler <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Kebler"),])
summary(glm2kebler)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Kebler"), ], init.theta = 0.497445695, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1331  -1.1530  -0.3222   0.3485   1.8286  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.8650     0.1519  25.440  < 2e-16 ***
## rating_near2  -0.1968     0.1781  -1.105  0.26907    
## rating_near3  -0.2424     0.1912  -1.267  0.20503    
## rating_near4  -1.2958     0.3729  -3.475  0.00051 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.4974) family taken to be 1)
## 
##     Null deviance: 601.85  on 492  degrees of freedom
## Residual deviance: 592.32  on 489  degrees of freedom
##   (173 observations deleted due to missingness)
## AIC: 4462.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.4974 
##           Std. Err.:  0.0310 
## 
##  2 x log-likelihood:  -4452.7740
glm2slate <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Slate River Rd"),])
summary(glm2slate)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Slate River Rd"), ], 
##     init.theta = 1.035728402, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0900  -0.9393  -0.2160   0.3116   4.0995  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.01432    0.09775  20.606  < 2e-16 ***
## rating_near2 -0.19287    0.11642  -1.657  0.09759 .  
## rating_near3 -0.38399    0.12652  -3.035  0.00241 ** 
## rating_near4 -2.61216    0.38567  -6.773 1.26e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(1.0357) family taken to be 1)
## 
##     Null deviance: 742.48  on 596  degrees of freedom
## Residual deviance: 691.03  on 593  degrees of freedom
##   (181 observations deleted due to missingness)
## AIC: 3372.8
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  1.0357 
##           Std. Err.:  0.0766 
## 
##  2 x log-likelihood:  -3362.8170
glm2snodgrass <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Snodgrass"),])
summary(glm2snodgrass)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Snodgrass"), ], init.theta = 0.1674608948, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8989  -0.7272  -0.7195  -0.5130   1.7779  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    0.5319     0.3736   1.424  0.15451   
## rating_near2  -0.9710     0.4300  -2.258  0.02392 * 
## rating_near3  -1.0131     0.4385  -2.310  0.02087 * 
## rating_near4  -2.1413     0.8295  -2.581  0.00984 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1675) family taken to be 1)
## 
##     Null deviance: 218.98  on 376  degrees of freedom
## Residual deviance: 208.91  on 373  degrees of freedom
##   (399 observations deleted due to missingness)
## AIC: 761.24
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.1675 
##           Std. Err.:  0.0266 
## 
##  2 x log-likelihood:  -751.2350
glm2washington <- glm.nb(user.count ~ rating_near, 
                  data=winter_travel[(winter_travel$has_sled==TRUE)&(winter_travel$Trailhead=="Washington Gulch"),])
summary(glm2washington)
## 
## Call:
## glm.nb(formula = user.count ~ rating_near, data = winter_travel[(winter_travel$has_sled == 
##     TRUE) & (winter_travel$Trailhead == "Washington Gulch"), 
##     ], init.theta = 0.4299627548, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3892  -1.3396  -0.3314   0.2116   2.7192  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.0770     0.1380   7.804 6.02e-15 ***
## rating_near2   0.2111     0.1650   1.279    0.201    
## rating_near3   0.0335     0.1805   0.186    0.853    
## rating_near4  -2.4632     0.5791  -4.254 2.10e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.43) family taken to be 1)
## 
##     Null deviance: 700.48  on 673  degrees of freedom
## Residual deviance: 677.87  on 670  degrees of freedom
##   (104 observations deleted due to missingness)
## AIC: 2960.4
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.4300 
##           Std. Err.:  0.0331 
## 
##  2 x log-likelihood:  -2950.4220

3. What combination of weather conditions yields the highest/lowest number of visitors?

ggplot(winter_travel, aes(air_temp, past5snow, size = user.count, color = rating_near))+
  geom_jitter(height = 0.2, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))

ggplot(winter_travel, aes(air_temp, change_depth, size = user.count, color = rating_near))+
  geom_jitter(height = 0.2, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))

ggplot(winter_travel, aes(air_temp, past5snow, size = user.count, color = rating_near))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  scale_color_manual(values = c("green", "yellow", "orange", "red"))+
  facet_grid(modality~Trailhead)

glm3 <- glm.nb(user.count~air_temp+change_depth+snow_depth, data=winter_travel)
summary(glm3)
## 
## Call:
## glm.nb(formula = user.count ~ air_temp + change_depth + snow_depth, 
##     data = winter_travel, init.theta = 0.1909529704, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2676  -1.2052  -0.7061  -0.0697   4.4103  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.259017   0.098000  23.051  < 2e-16 ***
## air_temp      0.010527   0.003252   3.237  0.00121 ** 
## change_depth -0.022163   0.012109  -1.830  0.06722 .  
## snow_depth   -0.010435   0.001754  -5.949  2.7e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.191) family taken to be 1)
## 
##     Null deviance: 8018.5  on 8667  degrees of freedom
## Residual deviance: 7976.1  on 8664  degrees of freedom
##   (1852 observations deleted due to missingness)
## AIC: 44328
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.19095 
##           Std. Err.:  0.00350 
## 
##  2 x log-likelihood:  -44318.48900

This ended up being basically the same as question 1, since the past days’ snow didn’t show significance.

4. What sample size is needed to represent the entire season ?

This depends on the question, and the effect size that we care about.

5. Which days of the week have the highest number of visitors? Weekdays vs weekends?

glm5 <- glm.nb(user.count~weekend, data=winter_travel)
summary(glm5)
## 
## Call:
## glm.nb(formula = user.count ~ weekend, data = winter_travel, 
##     init.theta = 0.1926010033, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2637  -1.1843  -0.7577  -0.0739   3.7320  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.96757    0.02944  66.829   <2e-16 ***
## weekendweekend  0.51481    0.05443   9.458   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1926) family taken to be 1)
## 
##     Null deviance: 8073.7  on 8667  degrees of freedom
## Residual deviance: 7978.6  on 8666  degrees of freedom
##   (1852 observations deleted due to missingness)
## AIC: 44272
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.19260 
##           Std. Err.:  0.00353 
## 
##  2 x log-likelihood:  -44266.37600
glm5a <- glm.nb(user.count~week, data=winter_travel)
summary(glm5a)
## 
## Call:
## glm.nb(formula = user.count ~ week, data = winter_travel, init.theta = 0.1926929823, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2729  -1.1858  -0.7428  -0.0550   3.6116  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.01370    0.06595  30.535  < 2e-16 ***
## weekMon     -0.09208    0.09240  -0.997    0.319    
## weekSat      0.52889    0.09218   5.738 9.60e-09 ***
## weekSun      0.40340    0.09264   4.354 1.33e-05 ***
## weekThu     -0.05289    0.09360  -0.565    0.572    
## weekTue     -0.04771    0.09327  -0.511    0.609    
## weekWed     -0.03837    0.09336  -0.411    0.681    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1927) family taken to be 1)
## 
##     Null deviance: 8076.7  on 8667  degrees of freedom
## Residual deviance: 7978.7  on 8661  degrees of freedom
##   (1852 observations deleted due to missingness)
## AIC: 44279
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.19269 
##           Std. Err.:  0.00353 
## 
##  2 x log-likelihood:  -44263.48100

Saturdays are about 13% busier than Sundays, weekdays are about half as busy as weekends.

6. How do visitation rates for different user groups vary seasonally?

ggplot(winter_travel, aes(year, user.count))+
  geom_jitter(height = 0, width = 0.2, alpha = 0.7)+
  facet_grid(modality~Trailhead)+
  scale_y_log10()

7. Which days of the year are busiest?

day_totals <- all_users %>% 
  group_by(date, week, weekend) %>%
  summarise(users = sum(user.count, na.rm = TRUE)) %>%
  arrange(desc(users))
## `summarise()` regrouping output by 'date', 'week' (override with `.groups` argument)
head(day_totals, 10)
## # A tibble: 10 x 4
## # Groups:   date, week [10]
##    date       week  weekend users
##    <fct>      <fct> <fct>   <int>
##  1 2018-02-17 Sat   weekend   819
##  2 2018-02-18 Sun   weekend   744
##  3 2018-03-03 Sat   weekend   665
##  4 2018-01-27 Sat   weekend   659
##  5 2018-12-29 Sat   weekend   623
##  6 2018-03-26 Mon   weekday   586
##  7 2018-12-30 Sun   weekend   568
##  8 2019-02-23 Sat   weekend   568
##  9 2018-03-11 Sun   weekend   544
## 10 2018-03-21 Wed   weekday   517
ggplot(day_totals, aes((yday(date)+30)%%365, users))+
  geom_smooth()+
  geom_point(aes(color=weekend))+
  scale_x_continuous(breaks = c(0,31,62,90,120), 
                     labels = c("Dec 1", "Jan 1", "Feb 1", "Mar 1", "Apr 1"))+
  labs(x="")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Jump in usage in the last week of the calendar year, otherwise gradual increase until sometime mid March.

8. Do different groups of users utilize the trail more during different times of the season?

day_totals_modes <- all_locations %>% 
  group_by(date, week, weekend, modality, has_sled) %>%
  summarise(users = sum(user.count, na.rm = TRUE)) %>%
  arrange(desc(users))
## `summarise()` regrouping output by 'date', 'week', 'weekend', 'modality' (override with `.groups` argument)
head(day_totals_modes, 10)
## # A tibble: 10 x 6
## # Groups:   date, week, weekend, modality [10]
##    date       week  weekend modality      has_sled users
##    <fct>      <fct> <fct>   <fct>         <lgl>    <int>
##  1 2018-01-27 Sat   weekend Non.motorized FALSE      601
##  2 2018-02-17 Sat   weekend Non.motorized FALSE      557
##  3 2018-02-18 Sun   weekend Non.motorized FALSE      514
##  4 2018-03-03 Sat   weekend Non.motorized FALSE      468
##  5 2019-02-23 Sat   weekend Non.motorized FALSE      445
##  6 2018-03-21 Wed   weekday Non.motorized FALSE      387
##  7 2017-12-30 Sat   weekend Non.motorized FALSE      367
##  8 2018-03-26 Mon   weekday Non.motorized FALSE      356
##  9 2020-03-15 Sun   weekend Non.motorized FALSE      342
## 10 2018-12-29 Sat   weekend Non.motorized FALSE      331
ggplot(day_totals_modes, aes((yday(date)+30)%%365, users))+
  geom_smooth()+
  geom_point(aes(color=weekend))+
  scale_x_continuous(breaks = c(0,31,62,90,120), 
                     labels = c("Dec 1", "Jan 1", "Feb 1", "Mar 1", "Apr 1"))+
  scale_y_log10()+
  labs(x="")+
  facet_grid(modality~1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

day_totals_modes2 <- all_locations %>% 
  group_by(date, week, weekend, has_sled) %>%
  summarise(users = sum(user.count, na.rm = TRUE)) %>%
  arrange(desc(users))
## `summarise()` regrouping output by 'date', 'week', 'weekend' (override with `.groups` argument)
ggplot(day_totals_modes2, aes((yday(date)+30)%%365, users))+
  geom_smooth()+
  geom_point(aes(color=weekend))+
  scale_x_continuous(breaks = c(0,31,62,90,120), 
                     labels = c("Dec 1", "Jan 1", "Feb 1", "Mar 1", "Apr 1"))+
  scale_y_log10()+
  labs(x="")+
  facet_grid(has_sled~1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Variance is greater in the motorized groups, and the end of year and mid March peaks are more pronounced.

gam6 <- gam(user.count ~ modality+s((yday(date)+30)%%365), family = nb(), data=all_locations)
summary(gam6)
## 
## Family: Negative Binomial(1.089) 
## Link function: log 
## 
## Formula:
## user.count ~ modality + s((yday(date) + 30)%%365)
## 
## Parametric coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.94062    0.05236  37.063  < 2e-16 ***
## modalityMechanized    -0.60637    0.07610  -7.968 1.62e-15 ***
## modalityMotorized      2.11499    0.07174  29.479  < 2e-16 ***
## modalityNon.motorized  2.78623    0.07159  38.921  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                             edf Ref.df Chi.sq p-value    
## s((yday(date) + 30)%%365) 8.014   8.74  124.2  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.529   Deviance explained = 59.5%
## -REML = 6353.6  Scale est. = 1         n = 1556

9. How did covid affect visitation rates and use patterns?

march_on <- all_users[month(all_users$date)%in%3:4,]
ggplot(march_on, aes((yday(march_on$date)+30%%365), user.count, color=year))+
  geom_point()+geom_smooth()+
  scale_x_continuous(breaks = c(90,122), 
                     labels = c("Mar 1", "Apr 1"))+
  scale_y_log10()+
  labs(x="")+
  facet_grid(has_sled~1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

I don’t see anything.

11. How do the days with the highest avalanche risk affect visitor rates?

high_risk <- as.numeric(winter_travel$rating_above)+
  as.numeric(winter_travel$rating_near)+
  as.numeric(winter_travel$rating_below) > 9
summary(glm.nb(user.count~high_risk, data=winter_travel))
## 
## Call:
## glm.nb(formula = user.count ~ high_risk, data = winter_travel, 
##     init.theta = 0.1914928382, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2142  -1.2142  -0.6832  -0.0954   4.3940  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    2.17484    0.02532  85.890   <2e-16 ***
## high_riskTRUE -1.22998    0.14481  -8.494   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.1915) family taken to be 1)
## 
##     Null deviance: 7972.2  on 8597  degrees of freedom
## Residual deviance: 7919.7  on 8596  degrees of freedom
##   (1922 observations deleted due to missingness)
## AIC: 44043
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.19149 
##           Std. Err.:  0.00352 
## 
##  2 x log-likelihood:  -44037.24100

On high risk days the usage is about 30% of the average on days that are not high risk.